# Load & clean dataset ----
# Reads the BRFSS 2015 diabetes health-indicator CSV, converts the column
# names to snake_case, and recodes the numeric survey codes into labelled
# factors (ordered where the categories have a natural ranking).
diabetes = 
  read_csv("Diabete datasets/diabetes_012_health_indicators_BRFSS2015.csv") |>
  janitor::clean_names() |> 
  mutate(
    sex = case_match(
      sex,
      0 ~ "Female",
      1 ~ "Male"
    ),
    sex = factor(sex),
    # Age bands: every label uses a plain ASCII hyphen so that typed filters
    # such as age == "25-29" match (some labels previously used an en-dash).
    age = case_match(
      as.numeric(age),
      1 ~ "18-24",
      2 ~ "25-29",
      3 ~ "30-34",
      4 ~ "35-39",
      5 ~ "40-44",
      6 ~ "45-49",
      7 ~ "50-54",
      8 ~ "55-59",
      9 ~ "60-64",
      10 ~ "65-69",
      11 ~ "70-74",
      12 ~ "75-79",
      13 ~ "80+"
    ),
    age = 
      factor(age, 
             levels = 
               c("18-24","25-29", "30-34", "35-39",
                 "40-44","45-49","50-54","55-59",
                 "60-64","65-69","70-74","75-79","80+"), 
             ordered = TRUE),
    education = case_match(
      as.numeric(education),
      1 ~ "Never attended school",
      2 ~ "Grade 1-8",
      3 ~ "Grade 9-11",
      4 ~ "High school graduate",
      5 ~ "College 1-3",
      6 ~ "College graduate"
     ),
    education = 
      factor(education, 
             levels = 
               c("Never attended school","Grade 1-8", "Grade 9-11",
                 "High school graduate","College 1-3",
                 "College graduate"), 
             ordered = TRUE),
    income = case_match(
      as.numeric(income),
      1 ~ "Less than $10,000",
      2 ~ "$10,000 - $15,000",
      3 ~ "$15,000 - $20,000",
      4 ~ "$20,000 - $25,000",
      5 ~ "$25,000 - $35,000",
      6 ~ "$35,000 - $50,000",
      7 ~ "$50,000 - $75,000",
      8 ~ "$75,000 +"
    ),
    income = 
      factor(income, 
             levels = 
               c("Less than $10,000","$10,000 - $15,000",
                 "$15,000 - $20,000","$20,000 - $25,000",
                 "$25,000 - $35,000","$35,000 - $50,000",
                 "$50,000 - $75,000","$75,000 +"), 
             ordered = TRUE),
    # General health: the BRFSS codebook scores this item 1 = excellent
    # through 5 = poor, so the lowest code is the best rating. The factor
    # levels below still run from worst to best.
    gen_hlth = case_match(
      as.numeric(gen_hlth),
      1 ~ "Excellent",
      2 ~ "Very Good",
      3 ~ "Good",
      4 ~ "Fair",
      5 ~ "Poor"
    ),
    gen_hlth = 
      factor(gen_hlth, 
             levels = 
               c("Poor", "Fair", "Good", "Very Good", "Excellent"), 
             ordered = TRUE),
    # Outcome: 0/1/2 codes become an unordered three-level factor.
    diabetes_012 = 
      factor(diabetes_012,
             levels = c(0,1,2),
             labels = c("No Diabetes", "Prediabetes", "Diabetes"))
  )
## Rows: 253680 Columns: 22
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (22): Diabetes_012, HighBP, HighChol, CholCheck, BMI, Smoker, Stroke, He...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Indicator columns that are recoded below into No/Yes factors.
binary_vars = c(
  "high_bp", "high_chol", "chol_check",
  "smoker", "stroke", "heart_diseaseor_attack",
  "phys_activity", "hvy_alcohol_consump", "any_healthcare",
  "no_docbc_cost", "diff_walk", "fruits", "veggies"
)

# A value of 1 becomes "Yes"; any other non-missing value becomes "No".
# The levels are spelled out so every column gets the same two levels in the
# same order, even if a column happens to contain only one of the values.
diabetes[binary_vars] = lapply(
  diabetes[binary_vars],
  function(x) factor(ifelse(x == 1, "Yes", "No"), levels = c("No", "Yes"))
)

The dataset has 253,680 records and 22 variables. Each record contains one individual’s BRFSS survey responses.

# Data summary: per-column overview of the cleaned data via skimr.
library(skimr)
diabetes |> skim()
Data summary
Name diabetes
Number of rows 253680
Number of columns 22
_______________________
Column type frequency:
factor 19
numeric 3
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
diabetes_012 0 1 FALSE 3 No : 213703, Dia: 35346, Pre: 4631
high_bp 0 1 FALSE 2 No: 144851, Yes: 108829
high_chol 0 1 FALSE 2 No: 146089, Yes: 107591
chol_check 0 1 FALSE 2 Yes: 244210, No: 9470
smoker 0 1 FALSE 2 No: 141257, Yes: 112423
stroke 0 1 FALSE 2 No: 243388, Yes: 10292
heart_diseaseor_attack 0 1 FALSE 2 No: 229787, Yes: 23893
phys_activity 0 1 FALSE 2 Yes: 191920, No: 61760
fruits 0 1 FALSE 2 Yes: 160898, No: 92782
veggies 0 1 FALSE 2 Yes: 205841, No: 47839
hvy_alcohol_consump 0 1 FALSE 2 No: 239424, Yes: 14256
any_healthcare 0 1 FALSE 2 Yes: 241263, No: 12417
no_docbc_cost 0 1 FALSE 2 No: 232326, Yes: 21354
gen_hlth 0 1 TRUE 5 Fai: 89084, Goo: 75646, Poo: 45299, Ver: 31570
diff_walk 0 1 FALSE 2 No: 211005, Yes: 42675
sex 0 1 FALSE 2 Fem: 141974, Mal: 111706
age 0 1 TRUE 13 60-: 33244, 65-: 32194, 55-: 30832, 50-: 26314
education 0 1 TRUE 6 Col: 107325, Col: 69910, Hig: 62750, Gra: 9478
income 0 1 TRUE 8 $75: 90385, $50: 43219, $35: 36470, $25: 25883

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
bmi 0 1 28.38 6.61 12 24 27 31 98 ▇▅▁▁▁
ment_hlth 0 1 3.18 7.41 0 0 0 2 30 ▇▁▁▁▁
phys_hlth 0 1 4.24 8.72 0 0 0 3 30 ▇▁▁▁▁

Diabetes distribution

# Bar chart of the number of respondents in each diabetes status category.
diabetes |>  
  count(diabetes_012) |>  
  plot_ly(
    x = ~diabetes_012,
    y = ~n,
    type = "bar",
    color = ~diabetes_012
  ) |>  
  layout(
    barmode = "stack",
    title = "Diabetes Prevalence",
    # The x variable is diabetes_012, so the axis is labelled accordingly.
    xaxis = list(title = "Diabetes Status"),
    yaxis = list(title = "Count")
  )

Sex & Age distribution

# Grouped bar chart of respondent counts per age group, split by sex.
diabetes |>
  count(sex, age) |>  
  plot_ly(
    x = ~age,
    y = ~n,
    type = "bar",
    color = ~sex,
    colors = c("tomato","skyblue")
  ) |>  
  layout(
    barmode = "group",
    title = "Sex and Age Group Distribution",
    xaxis = list(title = "Age Group"),
    yaxis = list(title = "Count")
  )
# Line chart of the share of respondents with diabetes in each age group,
# drawn as one line per sex.
diabetes |>
  group_by(age, sex) |> 
  # .groups = "drop" returns an ungrouped result: it silences the
  # "grouped output by 'age'" message, and it avoids handing plot_ly() a
  # data frame still grouped by age (plotly appears to split line traces on
  # dplyr groups, which would leave one point per segment).
  summarise(
    diabetes_rate = mean(diabetes_012 == "Diabetes"),
    .groups = "drop"
  ) |>  
  plot_ly(
    x = ~age,
    y = ~diabetes_rate,
    color = ~sex,
    colors = c("tomato","skyblue"),
    type = "scatter",
    mode = "lines+markers"
  ) |>  
  layout(
    title = "Diabetes Prevalence by Sex and Age Group",
    xaxis = list(title = "Age Group"),
    yaxis = list(title = "Diabetes Rate")
  )
## `summarise()` has grouped output by 'age'. You can override using the `.groups`
## argument.

BMI Distribution

# Box plot of BMI for each diabetes status, one coloured box per status.
diabetes |>
  plot_ly(
    type = "box",
    x = ~diabetes_012,
    y = ~bmi,
    color = ~diabetes_012
  ) |>
  layout(
    title = "BMI Distribution by Diabetes Status",
    xaxis = list(title = "Diabetes Status"),
    yaxis = list(title = "BMI")
  )